Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
debakarr
GitHub Repository: debakarr/machinelearning
Path: blob/master/Part 1 - Data Preprocessing/[Python] Data Preprocessing.ipynb
1002 views
Kernel: Python 3

Data Preprocessing

Importing the libraries

import numpy as np import matplotlib.pyplot as plt import pandas as pd from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler from sklearn.model_selection import train_test_split %matplotlib inline

Importing the dataset

# Load the raw data set; expects Data.csv in the working directory.
# From the outputs below: columns are Country, Age, Salary, Purchased.
dataset = pd.read_csv('Data.csv')
dataset
# All columns except the last form the feature matrix (object dtype,
# since Country is a string and Age/Salary are floats with NaNs).
X = dataset.iloc[:, :-1].values # matrix of features/independent variables
X
array([['France', 44.0, 72000.0], ['Spain', 27.0, 48000.0], ['Germany', 30.0, 54000.0], ['Spain', 38.0, 61000.0], ['Germany', 40.0, nan], ['France', 35.0, 58000.0], ['Spain', nan, 52000.0], ['France', 48.0, 79000.0], ['Germany', 50.0, 83000.0], ['France', 37.0, 67000.0]], dtype=object)
# Dependent variable: the last column ('Purchased').  Indexing with -1
# instead of the hard-coded position 3 keeps this correct if feature
# columns are ever added to the CSV; for this 4-column dataset the two
# are identical.
Y = dataset.iloc[:, -1].values # dependent variables
Y
array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

Taking care of missing data

# Replace missing numeric values (Age, Salary) with the column mean.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the supported replacement.  It has no `axis` argument:
# it always imputes column-wise, which matches the old axis=0 behaviour.
# missing_values is the actual np.nan value, not the string 'NaN'.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X
array([['France', 44.0, 72000.0], ['Spain', 27.0, 48000.0], ['Germany', 30.0, 54000.0], ['Spain', 38.0, 61000.0], ['Germany', 40.0, 63777.77777777778], ['France', 35.0, 58000.0], ['Spain', 38.77777777777778, 52000.0], ['France', 48.0, 79000.0], ['Germany', 50.0, 83000.0], ['France', 37.0, 67000.0]], dtype=object)

Encoding categorical data

# Turn the Country strings into integer codes (from the output below:
# France=0, Germany=1, Spain=2, i.e. alphabetical order) so the column
# can be one-hot encoded in the next cell.
# NOTE(review): LabelEncoder is documented for target labels, not
# features; modern OneHotEncoder accepts string columns directly, which
# would make this step unnecessary -- confirm before reusing elsewhere.
labelencoder_X = LabelEncoder() X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X
array([[0, 44.0, 72000.0], [2, 27.0, 48000.0], [1, 30.0, 54000.0], [2, 38.0, 61000.0], [1, 40.0, 63777.77777777778], [0, 35.0, 58000.0], [2, 38.77777777777778, 52000.0], [0, 48.0, 79000.0], [1, 50.0, 83000.0], [0, 37.0, 67000.0]], dtype=object)
# One-hot encode the Country column (index 0).
# OneHotEncoder's `categorical_features` argument was removed in
# scikit-learn 0.22; ColumnTransformer is the supported way to encode
# selected columns.  remainder='passthrough' appends the untouched
# columns (Age, Salary) after the encoded ones, reproducing the old
# column order: France, Germany, Spain dummies first, then Age, Salary.
from sklearn.compose import ColumnTransformer

oneHotEncoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = oneHotEncoder.fit_transform(X)
# Depending on the scikit-learn version the result may be sparse;
# densify to match the original .toarray() output.
if hasattr(X, 'toarray'):
    X = X.toarray()
X # 1st column is replaced by 3 columns # they represent France, Germany and Spain respectively
array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01, 7.20000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01, 4.80000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01, 5.40000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01, 6.10000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01, 6.37777778e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01, 5.80000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01, 5.20000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01, 7.90000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01, 8.30000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01, 6.70000000e+04]])
# Encode the dependent variable: 'No'/'Yes' become 0/1 (alphabetical
# order, as the output below confirms).
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
Y
array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

Splitting the dataset into the Training set and Test set

# Hold out 20% of the rows for testing (10 rows -> 8 train / 2 test).
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
# Inspect the split: with 10 rows and test_size=0.2 we get 8 training
# rows and 2 test rows; the X and Y splits stay aligned row-for-row.
X_train
array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01, 5.80000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01, 7.20000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01, 7.90000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01, 5.40000000e+04], [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01, 6.70000000e+04], [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01, 6.37777778e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01, 6.10000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01, 5.20000000e+04]])
len(X_train)
8
X_test
array([[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01, 8.30000000e+04], [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01, 4.80000000e+04]])
len(X_test)
2
Y_train
array([1, 0, 1, 0, 1, 1, 0, 0])
len(Y_train)
8
Y_test
array([0, 1])
len(Y_test)
2

Feature Scaling

# Standardize all columns to zero mean / unit variance.  The scaler is
# fitted on the training set only and the same parameters are applied
# to the test set, so no test-set information leaks into the fit.
scale_X = StandardScaler().fit(X_train)
X_train = scale_X.transform(X_train)
X_test = scale_X.transform(X_test)
# Inspect the scaled matrices; the dummy columns are scaled too, which
# is harmless here but worth noting.
X_train
array([[ 1. , -0.57735027, -0.57735027, -0.7529426 , -0.62603778], [ 1. , -0.57735027, -0.57735027, 1.00845381, 1.01304295], [ 1. , -0.57735027, -0.57735027, 1.79129666, 1.83258331], [-1. , 1.73205081, -0.57735027, -1.73149616, -1.09434656], [ 1. , -0.57735027, -0.57735027, -0.36152118, 0.42765698], [-1. , 1.73205081, -0.57735027, 0.22561096, 0.05040824], [-1. , -0.57735027, 1.73205081, -0.16581046, -0.27480619], [-1. , -0.57735027, 1.73205081, -0.01359102, -1.32850095]])
X_test
array([[-1. , 1.73205081, -0.57735027, 2.18271808, 2.30089209], [-1. , -0.57735027, 1.73205081, -2.3186283 , -1.79680973]])